########################################################
# Project:    Talking to the Populist Radical Right
# Task:       The script produces the raw correspondence
#             analysis scores for the German parties
# Author:     Jan Schwalbach (21/07/2022)
########################################################

# Loading packages and data
library(quanteda.textplots)
library(quanteda.textmodels)
library(quanteda)
library(RCurl)
library(XML)
library(stringr)
library(lubridate)
library(caret)
library(e1071)
library(plyr)
library(ggplot2)
library(dplyr)
library(readtext)
library(data.table)
library(wordshoal)
library(DT)
library(xtable)
library(zoo)
library(tm)
library(ggrepel)
library(grid)
library(ngram)

# The corpus file and this script must sit in the same folder, and the working
# directory must point to that folder. The code below expects the loaded file
# to provide the object `corpusD`.
load(file = "Corpus_Germany.Rdata")


# Keep only speeches that carry a party label other than "fraktionslos"
# (members without a parliamentary group). Rows with a missing party are
# dropped as well -- presumably these are the speaker/chair; verify against
# the corpus documentation.
corpusD <- corpusD[!is.na(corpusD$party) & corpusD$party != "fraktionslos", ]

# Shorten the two long party names to English labels.
corpusD$party <- gsub("BÜNDNIS 90/DIE GRÜNEN", "GREENS", corpusD$party,
                      fixed = FALSE)
corpusD$party <- gsub("DIE LINKE", "LEFT", corpusD$party,
                      fixed = FALSE)

### Subsetting the data set

# Set `debate_topic` to exactly ONE of the following to choose which speeches
# enter the analysis:
#   "all"         - every speech, no topic filter
#   "immigration" - filter on the columns migrationpres1 / migrationcount
#   "education"   - filter on the columns educationpres1 / educationcount
#
# The options are alternatives. Running the immigration filter and then the
# education filter one after the other would keep only speeches that satisfy
# both, so the choice is made explicit here instead of relying on the user
# executing selected lines by hand.
debate_topic <- "all"

if (!identical(debate_topic, "all")) {
  # Column-name stem used by the corpus for the chosen topic.
  topic_prefix <- switch(debate_topic,
    immigration = "migration",
    education = "education",
    stop('debate_topic must be "all", "immigration" or "education"',
         call. = FALSE)
  )
  topic_flag <- corpusD[[paste0(topic_prefix, "pres1")]]
  topic_count <- corpusD[[paste0(topic_prefix, "count")]]
  if (is.null(topic_flag) || is.null(topic_count)) {
    stop("corpusD lacks the ", topic_prefix, "pres1 / ", topic_prefix,
         "count columns", call. = FALSE)
  }

  # A speech qualifies when its presence flag is "TRUE" or its count exceeds 2,
  # and in either case its count must be above 0.
  keep <- (topic_flag == "TRUE" | topic_count > 2) & topic_count > 0

  # which() treats an undecidable (NA) condition as "do not keep"; indexing
  # with the raw logical vector would insert all-NA rows instead.
  corpusD <- corpusD[which(keep), ]

  # The earlier rbind-based union also collapsed rows that are identical in
  # every column; this is kept so the selected set of speeches is unchanged.
  corpusD <- corpusD[!duplicated(corpusD), ]
}

# Aggregating all speeches by a party on the debate level

library(plyr)

# Debate key: the speech date pasted to the debate label, with punctuation
# removed. Party-debate key: the party pasted in front of the debate key.
corpusD$debateunique <- removePunctuation(paste(corpusD$date, corpusD$debate))
corpusD$debateparty <- paste(corpusD$party, corpusD$debateunique)

# One row per party-debate key. Every column is collapsed into a single
# comma-separated string; for `text` this concatenates all speeches of the
# group into one document.
corpusnew <- ddply(
  corpusD, .(debateparty), summarize,
  date = paste(date, collapse = ","),
  debateunique = paste(debateunique, collapse = ","),
  text = paste(text, collapse = ","),
  type = paste(type, collapse = ","),
  party = paste(party, collapse = ",")
)

# For the metadata columns only the part before the first comma is kept, i.e.
# the value of the first speech in the group.
# NOTE(review): this assumes the individual values contain no comma themselves.
corpusnew[c("party", "date", "type", "debateunique")] <- lapply(
  corpusnew[c("party", "date", "type", "debateunique")],
  function(collapsed) gsub("\\,.*", "", collapsed, fixed = FALSE)
)

corpusD <- as.data.frame(corpusnew)
corpusD$date <- as.Date(corpusD$date)

# Keeping only sufficiently long party-debate documents and debates with
# enough of them:
#   1. a document must contain at least 50 word tokens (runs of \w characters);
#   2. a debate must retain MORE THAN TWO such documents, i.e. at least three
#      party-level rows.
# NOTE(review): this step was previously described as "more than one party
# participated", which would correspond to n() > 1; confirm which threshold is
# intended.

corpusD <- as.data.frame(corpusD)
corpusD$length <- str_count(corpusD$text, '\\w+')

corpusD <- corpusD[corpusD$length >= 50, ] %>%
  group_by(debateunique) %>%
  filter(n() > 2) %>%
  as.data.frame()

### To get the scores for all/government/opposition debates,
### set `debate_type` to exactly ONE of:
###   "all"        - every debate that passed the filters above
###   "government" - rows whose type is "government"
###   "opposition" - rows whose type is "opposition"
###
### The three variants are alternatives. Building them one after the other
### creates three corpora and always leaves the opposition corpus in
### `corpusDcorpus`, so the choice is made explicit here.
debate_type <- "all"

# Rows with type "0" are excluded in every variant.
# NOTE(review): "0" presumably marks debates without a government/opposition
# classification -- confirm against the corpus documentation.
corpusD2017 <- corpusD[corpusD$type != "0", ]

# Both subsets are always created so that any later code referring to them
# keeps working regardless of the selected variant.
corpusD2017g <- corpusD2017[corpusD2017$type == "government", ]
corpusD2017o <- corpusD2017[corpusD2017$type == "opposition", ]

# Only the selected variant is turned into a quanteda corpus.
corpusDcorpus <- corpus(
  switch(debate_type,
    all = corpusD2017,
    government = corpusD2017g,
    opposition = corpusD2017o,
    stop('debate_type must be "all", "government" or "opposition"',
         call. = FALSE)
  )
)

# Creating a dfm (removing punctuation, number, and stopwords)

# Tokenise the selected corpus, dropping punctuation, symbols and numbers.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
Germany_token <- tokens(
  corpusDcorpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE
)

# Remove German stopwords before building the document-feature matrix.
Germany_token <- tokens_select(Germany_token, pattern = stopwords("de"),
                               selection = "remove")

dfm_Germany <- dfm(Germany_token)

# Stem the features, then pool all documents of a party into one row so the
# model below receives one document per party.
dfm_Germany <- dfm_wordstem(dfm_Germany, language = "german")
#dfm_Germany <- dfm_trim(dfm_Germany, min_docfreq = 2)
dfm_Germany <- dfm_group(dfm_Germany, groups = party)


# Fit the correspondence analysis on the party-level dfm.
tmod_ca <- textmodel_ca(dfm_Germany)

# Document (here: party) coordinates on the first and second dimension.
dat_ca <- data.frame(
  dim1 = coef(tmod_ca, doc_dim = 1)$coef_document,
  dim2 = coef(tmod_ca, doc_dim = 2)$coef_document
)

# Quick visual / console checks: one-dimensional scaling plot, the current
# working directory, and the raw score table.
textplot_scale1d(tmod_ca)
getwd()
dat_ca

# Attach a party label taken from the model's row names; anything from a
# " <four digits>" suffix onwards is cut off.
allvalues <- dat_ca
allvalues$party <- gsub(" [0-9][0-9][0-9][0-9].*", "", tmod_ca$rownames,
                        fixed = FALSE)

# Plotting the respective values for the two most important dimensions

# Scatter of dimension 1 against dimension 2 with one repelled label per row.
# Label fill colours come from the six manual values below.
# NOTE(review): the colours are unnamed, so they are presumably assigned in the
# sorted order of the `party` values and the plot expects at most six parties
# -- verify the mapping.
p <- ggplot(allvalues, aes(dim1, dim2)) +
  geom_point(color = 'red') +
  geom_label_repel(
    aes(label = party, fill = party),
    color = 'white',
    size = 5
  ) +
  scale_fill_manual(
    values = c("dodgerblue1", "grey3", "yellow", "green3", "red4", "red")
  ) +
  labs(title = "Debates Germany", y = "Dimension 2", x = "Dimension 1") +
  theme_classic(base_size = 20) +
  theme(legend.position = "none")
p


### Creating data sets for the analysis for:

# all debates
#
# Each object below is a copy of the CURRENT `allvalues`; only the names of its
# first two columns (dimension 1 and 2) differ.
# NOTE(review): unless `allvalues` is recomputed with a different debate
# selection between these assignments, the "all", "gov" and "op" columns hold
# identical numbers.

CA_D_all <- allvalues
names(CA_D_all)[1:2] <- c("CA_D_all_D1", "CA_D_all_D2")

CA_D_all_gov <- allvalues
names(CA_D_all_gov)[1:2] <- c("CA_D_all_gov_D1", "CA_D_all_gov_D2")

CA_D_all_op <- allvalues
names(CA_D_all_op)[1:2] <- c("CA_D_all_op_D1", "CA_D_all_op_D2")

# Put the three score tables side by side and write them to disk.
Positions_CA_D_ALL <- cbind(CA_D_all, CA_D_all_gov, CA_D_all_op)
save(Positions_CA_D_ALL, file = "CA_D_ALL.Rdata")

##### Immigration debates
#
# Each object below is a copy of the CURRENT `allvalues`; only the names of its
# first two columns (dimension 1 and 2) differ.
# NOTE(review): unless `allvalues` is recomputed for the immigration subset and
# the respective debate type between these assignments, the three column sets
# hold identical numbers.

CA_D_MIG <- allvalues
names(CA_D_MIG)[1:2] <- c("CA_D_MIG_D1", "CA_D_MIG_D2")

CA_D_MIG_gov <- allvalues
names(CA_D_MIG_gov)[1:2] <- c("CA_D_MIG_gov_D1", "CA_D_MIG_gov_D2")

CA_D_MIG_op <- allvalues
names(CA_D_MIG_op)[1:2] <- c("CA_D_MIG_op_D1", "CA_D_MIG_op_D2")

# Put the three score tables side by side and write them to disk.
Positions_CA_D_MIG <- cbind(CA_D_MIG, CA_D_MIG_gov, CA_D_MIG_op)
save(Positions_CA_D_MIG, file = "CA_D_MIG.Rdata")

##### Education debates
#
# Each object below is a copy of the CURRENT `allvalues`; only the names of its
# first two columns (dimension 1 and 2) differ.
# NOTE(review): unless `allvalues` is recomputed for the education subset and
# the respective debate type between these assignments, the three column sets
# hold identical numbers.

CA_D_ED <- allvalues
names(CA_D_ED)[1:2] <- c("CA_D_ED_D1", "CA_D_ED_D2")

CA_D_ED_gov <- allvalues
names(CA_D_ED_gov)[1:2] <- c("CA_D_ED_gov_D1", "CA_D_ED_gov_D2")

CA_D_ED_op <- allvalues
names(CA_D_ED_op)[1:2] <- c("CA_D_ED_op_D1", "CA_D_ED_op_D2")

# Put the three score tables side by side and write them to disk.
Positions_CA_D_ED <- cbind(CA_D_ED, CA_D_ED_gov, CA_D_ED_op)
save(Positions_CA_D_ED, file = "CA_D_ED.Rdata")
